
* * * PREP INDIVIDUAL TAX DATA, CANTON OF OW, 2001-2010  * * *
* (data originially exported by tax administration in Fall 2012 or Spring 2013)

// this file prepares the data and creates moving date variable and indicators needed
// zuzugjahr is based on the official moving date, firstyear is simply used to indicate first appearance in my data

// author Isabel Martinez

// date Sep 29 2015
// updated July 2016
// updated October 2016
// updated October 2020

version 13.1 

clear all
set more off
cap log close
cap graph drop _all
cap clear matrix 


use "/Volumes/OW_data/my_owdat.dta", clear

* drop Erbgemeinschaften und Eigntümerschaften
tab sex
xttab sex
tab zuzugdatum if sex==8
tab zuzugdatum if sex==8 & zuzugkt != 6 //678 observations with sex==8 & origin out of OW
drop if sex==7 | sex==8

dis _N // 312561

*-------------------------------------------------------------------------------
* DEFINE A GOOD YEAR VARIABLE based on ereigfall, and drop duplicates
*-------------------------------------------------------------------------------
* unterjährige Steuerfplicht 1
gen yr=year(ereigfall) + 1
gen month=month(ereigfall)
gen testyear=year-yr
tab month if testyear==0 // these were all in december, so they're ok
tab month if testyear==-1 // here (1,937 obs) you have all 12 months, probably unterjährige steuerpflicht
bys year: tab month if testyear==-1 
gen unterj1=(testyear==-1)
	label var unterj1 "unterjährige Steuerpflicht"
	note unterj1: unterjährige Steuerpflicht, many of them will be dropped after dropping the duplicates

* just to check whether I got the right definition for unterjährige Steuerpflicht
gen stpflength=ereigbis-ereigvon
	label var stpflength "Steuerpflicht in Tagen"
sum stpflength, de
tab stpflength, mi
gen unterj2=(stpflength<364)
    replace unterj2=1 if stpflength==364 & (year==2004 | year==2008 | year==2012) // this is to take into account the leap years
	label var unterj2 "unterjährige Steuerpflicht"
	note unterj2: unterjährige Steuerpflicht (weniger als 364 (356) Tage) bei Zuzug aus dem Ausland, Wegzug ins Ausland, Tod einer steuerpflichtigen Person, Entlassung aus der Quellensteuer und Eintritt in die ordentliche Steuerpflicht.
tab year unterj2 
tab year unterj1
tab unterj1 unterj2


* deal with duplicates
sort persid year ereigvon ereigbis
drop dup
duplicates tag persid year, gen(dup)
tab dup
bysort year: tab dup
//the duplicates are distributed evenly over the years: around 100 obs (70 in 2006, 128 in 2001) or a bit less than 0.5% of the sample each year

bysort year: tab zivilstand dup
bysort year: tab unterj1 dup
bysort year: tab unterj2 dup
// the duplicates are people who have unterjährige Steuerpflicht after divorce or death of spouse

/* What do those with duplicates look like in terms of income and wealth?
twoway kdensity esteuerbst if month==12&dup==1 || kdensity esteuerbst if month==1&dup==1 
twoway kdensity vsteuerbst if month==12&dup==1 || kdensity vsteuerbst if month==1&dup==1
twoway kdensity esteuerbst if month==12&dup==1&esteuerbst<=300000 || kdensity esteuerbst if month==1&dup==1&esteuerbst<=300000, legend(label (1 "december") label (2 "january"))
twoway kdensity vsteuerbst if month==12&dup==1&vsteuerbst<=600000 || kdensity vsteuerbst if month==1&dup==1&vsteuerbst<=600000, legend(label (1 "december") label (2 "january")) 
// plots suggest that there is no big difference between first and second assesment
*/

* no need to drop full panel if there ever is a duplicate, but replace income vars with missing
gen panel_dup=0
	label var panel_dup "panel had duplicates"
bys persid (dup):replace panel_dup=1 if dup[_N]>0 & dup !=. //7083 changes made

foreach var in 	z110s1	z146s1	z207s1	z243s1	z288s1	z370s1 	z110s2	z146s2	z207s2	z243s2	z288s2	z370s2 	z111s1	z149s1	z208s1	z244s1	z289s1	z375s1 	z111s2	z149s2	z208s2	z244s2	z289s2	z375s2 	z114s1	z150s1	z209s1	z245s1	z290s1	z380s1 	z114s2	z150s2	z209s2	z245s2	z290s2	z381s1 	z115s1	z151s1	z210s1	z246s1	z291s1	z390s1 	z115s2	z151s2	z210s2	z246s2	z291s2	z391s1 	z118s1	z152s1	z211s1	z247s1	z292s1	z400s1 	z118s2	z152s2	z211s2	z247s2	z292s2	z404s1 	z119s1	z160s1	z212s1	z250s1	z293s1	z408s1 	z119s2	z160s2	z212s2	z250s2	z293s2	z412s1 	z120s1	z161s1	z215s1	z251s1	z299s1	z414s1 	z120s2	z161s2	z215s2	z251s2	z299s2	z416s1 	z124s1	z164s1	z216s1	z255s1	z300s1	z419s1 	z124s2	z164s2	z216s2	z255s2	z300s2	z420s1 	z125s1	z168s1	z217s1	z256s1	z305s1	z421s1 	z125s2	z168s2	z217s2	z256s2	z305s2	z422s1 	z126s1	z170s1	z218s1	z260s1	z306s1	z429s1 	z126s2	z170s2	z218s2	z260s2	z306s2	z430s1 	z127s1	z180s1	z220s1	z265s1	z307s1	z431s1 	z127s2	z180s2	z220s2	z265s2	z307s2	z432s1 	z128s1	z181s1	z221s1	z266s1	z308s1	z433s1 	z128s2	z181s2	z221s2	z266s2	z308s2	z435s1 ste_sachb_code	z130s1	z182s1	z222s1	z267s1	z309s1	z439s1 rektnr	z130s2	z182s2	z222s2	z267s2	z309s2	z440s1 	z131s1	z183s1	z223s1	z270s1	z310s1	z441s1 	z131s2	z183s2	z223s2	z270s2	z310s2	z442s1 	z132s1	z189s1	z224s1	z271s1	z311s1	z443s1 	z132s2	z189s2	z224s2	z271s2	z311s2	z445s1 	z133s1	z190s1	z225s1	z272s1	z312s1	z450s1 	z133s2	z190s2	z225s2	z272s2	z312s2	z460s1 fw_code_stp	z134s1	z191s1	z230s1	z273s1	z313s1	z461s1 fw_code_pa	z134s2	z191s2	z230s2	z273s2	z313s2	z462s1 	z135s1	z192s1	z231s1	z274s1	z330s1	z465s1 	z135s2	z192s2	z231s2	z274s2	z330s2	z470s1 	z136s1	z193s1	z232s1	z275s1	z350s1	z471s1 	z136s2	z193s2	z232s2	z275s2	z350s2	z472s1 	z137s1	z194s1	z233s1	z276s1	z351s1	z473s1 	z137s2	z194s2	z233s2	z276s2	z351s2	z480s1 	z138s1	z199s1	z234s1	z280s1	z352s1	z481s1 	z138s2	z199s2	z234s2	z280s2	z352s2 	z139s1	z201s1	z235s1	z281s1	z353s1 	z139s2	z201s2	z235s2	z281s2	z353s2 va_sachb_code z140s1	z202s1	z236s1	z282s1	z354s1 	z140s2	z202s2	z236s2	z282s2	z354s2 z100s1	z141s1	z203s1	z237s1	z284s1	z355s1 z100s2	z141s2	z203s2	z237s2	z284s2	z355s2 z101s1	z142s1	z204s1	z240s1	z285s1	z360s1 z101s2	z142s2	z204s2	z240s2	z285s2	z360s2 z104s1	z143s1	z205s1	z241s1	z286s1	z361s1 z104s2	z143s2	z205s2	z241s2	z286s2	z361s2 z105s1	z145s1	z206s1	z242s1	z287s1	z362s1 z105s2	z145s2	z206s2	z242s2	z287s2	z362s2 {
replace `var' =. if dup>0
}

sort persid year ereigvon ereigbis
egen duptag= tag(persid year dup) //this should tag only one of the duplicates
replace duptag=0 if dup==0

drop if duptag==1 //(638 observations deleted), but still repeated time values within panels from those with more than one duplicate
bys persid (dup):drop if dup[_N]==2 // these few with more than one excess observation: drop whole panel 24 obs deleted

dis _N //311950

drop testyear dup month yr duptag


* xtset the data
xtset persid year
note: this version of the data set excludes duplicates (662 obs), therefore the calculation of the tax revenue will be somewhat underestimated



* SOME CHARACTERISTICS NEEDED TO SET UP THE DATA

* generate an age variable
replace jahrgang=. if jahrgang==1800 | jahrgang==1700 
gen age=year-jahrgang
tab age

//42 obs are younger than 17, but many of them are school students and aprentees around age 14/15/16. 
	// id 83202 is messed up, I guess she's 10 years older, but then also other stuff is strange...
	drop if persid==83202 //11 obs deleted

* generate the number of kids using the fact that deduction z472 is standard for each child
gen snpanzki=z472s1/10000  //ART 54, never changed since 1995
replace snpanzki=0 if snpanzki==.
replace snpanzki=0 if snpanzki<0.1 // (5 real changes made)
replace snpanzki=1 if snpanzki>=0.100000000 & snpanzki<=1 // (106 real changes made)
replace snpanzki=2 if snpanzki>1 & snpanzki<=2 // (40 real changes made)
replace snpanzki=3 if snpanzki>2 & snpanzki<=3 // (11 real changes made)
label var snpanzki "no. of children"

* generate a marital status variable based on tarif_code (because marital status in the data never changes, it only reflects last marital status when data was exported)
gen zivilstand_tarif=0 if tarif_code==0
replace zivilstand_tarif=1 if tarif_code==1
replace zivilstand_tarif=zivilstand if zivilstand_tarif==. & tarif_code==2 // these are the expenditure based, leaving some observations undetermined
	label var zivilstand_tarif "marital status based on tarif_code"
	label values zivilstand_tarif ZIVILSTAND
	note zivilstand_tarif: info taken from tarif code but replaced with zivilstand for expenditure based

* generate yet another civil status variable reflecting the true situation
gen civil_stat = zivilstand
label var civil_stat "civil status reflecting true situation and changes (not for tax purposes!)"
label values civil_stat ZIVILSTAND
    // those who were married before divorced/separated/widowed 
    by persid (year):  replace civil_stat = 1 if civil_stat > 1 & tarif_code==1 
    // those who were single before getting married and have no kids
    by persid (year):  replace civil_stat = 0 if civil_stat ==1 & tarif_code==0 & snpanzki==0
    // those who were single before getting married and have kids:
    by persid (year):  replace civil_stat = 0 if civil_stat ==1 & tarif_code==0 & snpanzki>0
 

tab civil_stat, mi
xttab civil_stat

tab zivilstand_tarif, mi
tab zivilstand, mi
tab zivilstand_tarif zivilstand, mi
bys year : tab zivilstand_tarif zivilstand, mi

// add a dummy for single parents
gen singleparent= (tarif_code >0 & tarif_code <5 & snpanzki >0)

* civil status and kids	
tab zivilstand_tarif zivilstand if snpanzki>0
tab zivilstand sex
tab zivilstand_tarif sex

sort persid year
gen switch = .
label var switch "switched civil status according to tax code for single and married"
by persid (year): replace switch = cond(L.zivilstand_tarif == zivilstand_tarif, 0, 1) // cond: if L.zivilstand_tarif == zivilstand_tarif, replace with 0, otherwise replace with 1
by persid (year): replace switch = . if L.zivilstand_tarif==.
tab switch, mi // only 2% of the sample switch...

bys year:  tab switch, mi 

*-------------------------------------------------------------------
* GENERATE A MOVE-IN VARIABLE
*-------------------------------------------------------------------

* gen move-in variable at year level, based on admin info
gen zuzugjahr_original = year(zuzugdatum) 
label var zuzugjahr_original "move-in year"
note zuzugjahr_original: move-in year based on admin. record for variable "zuzugdatum"


/*
tab zuzugjahr_original if zuzugjahr_original<1901

    move-in |
       year |      Freq.     Percent        Cum.
------------+-----------------------------------
       1699 |         31       46.27       46.27
       1799 |          7       10.45       56.72
       1899 |         28       41.79       98.51
       1900 |          1        1.49      100.00
------------+-----------------------------------
      Total |         67      100.00
*/

//tab jahrgang if zuzugjahr_original==1900 // he's ok
//tab jahrgang if zuzugjahr_original==1899 // 2 actually were born in 1899, the other 26 in 1944
replace zuzugjahr_original=1944 if zuzugjahr_original==1899 & jahrgang==1944
//tab jahrgang if zuzugjahr_original==1799 // they were all born in 1926
replace zuzugjahr_original=1926 if zuzugjahr_original==1799 & jahrgang==1926
//tab jahrgang if zuzugjahr_original==1699 // they were all born in different years, 1909-1958
replace zuzugjahr_original=jahrgang if zuzugjahr_original==1699

xttab zuzugjahr_original


* find first observation of each tax unit
sort persid year zuzugdatum
by persid (year): gen first=(_n==1) //tag first year observation
gen firstyear_raw=year if first==1 
sum firstyear_raw //goes from min2001 to max 2013 OK!
	bysort persid (year): replace firstyear_raw=firstyear_raw[_n-1] if firstyear_raw==. //this produces a cascade effect to fill in the  year for all obs
	label var firstyear_raw "first year of appearance in the data"
tab firstyear_raw, mi //now 73% of obs are first observed in 2001
xttab firstyear_raw // this corresponds to 58% of the panels


gen firstyear=year if first==1 
replace firstyear=zuzugjahr_original if year==2001 & zuzugjahr_original<=2001 //(15408 real changes made)
replace firstyear=zuzugjahr_original if zuzugjahr_original<firstyear // (257989 real changes made)  since this indicates that person was there already
bysort persid (year): replace firstyear=firstyear[_n-1] if firstyear==. //this produces a cascade effect to fill in the  year for all obs
	label var firstyear "first year of appearance in the data, corrected for original move-in record"
	note firstyear: if year==2001  & zuzugjahr_original=<2001, firstyear was replaced with zuzugjahr_original
	note firstyear: if zuzugjahr_original<firstyear, firstear was replaced with zuzugjahr_original, since this indicates that person was there already


*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 	
* * * * * * * HOW DOES FIRSTYEAR COMPARE TO ZUZUGJAHR? * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 	

* * * * distribution of incomers according to the two dates * * * * *
tab firstyear, mi //now 11% of obs are first observed in 2001 -->still too many compared to zuzugjahr_original...
xttab firstyear // this corresponds to 19% of the panels
xttab zuzugjahr_original

* zuzugjahr_original is often missing :-(
tab zuzugjahr_original, mi
codebook zuzugjahr_original // 24996 out of 311939 or 8% missing
replace zuzugjahr_original=99999 if zuzugjahr_original==.
xttab zuzugjahr_original //2810 or 7.24% of people have a missing zuzugjahr_original
replace zuzugjahr_original=. if zuzugjahr_original==99999

* does missing zuzugjahr_original explain the bunch in firstyear?
tab firstyear if zuzugjahr_original==. //zuzugjahr_original is missing for  24,996  obs, and in 93% of these cases, firstyear==2001
twoway kdensity  firstyear || (kdensity zuzugjahr_original) if zuzugjahr_original>=1899, xline(2001)
tab zuzugkt if zuzugjahr_original==., mi
tab zuzugvonbfs if zuzugjahr_original==., mi
xttab firstyear if zuzugjahr_original==. //2504 people were there in 2001 and had no zuzugjahr_original



* when were people born? maybe I could replace the zuzugjahr_original with the birthyear if zuzugjahr_original is missing AND firstyear==2001
xttab jahrgang if zuzugjahr_original==.
graph drop _all
twoway (kdensity jahrgang if zuzugjahr_original==.) || (kdensity jahrgang), legend(label(1 "kdensity Jahrgang: move-in date missing") label(2 "kdensity Jahrgang: all") rows(2)) 
twoway (kdensity zuzugjahr_original) || (kdensity firstyear), xlabel(1900(10)2020, grid) legend(label(1 "kdensity zuzugjahr (original)") label(2 "kdensity first year") rows(2)) name(original)
preserve
  replace firstyear=jahrgang if firstyear==2001 & zuzugjahr_original==.
  twoway (kdensity zuzugjahr_original) || (kdensity firstyear),  xlabel(1900(10)2020, grid) legend(label(1 "kdensity zuzugjahr (original)") label(2 "kdensity first year") rows(2)) name(modified)
  // graphs indicate that this would not fully solve my problem, because too much mass is placed on those born/moving in before 1960
  xttab firstyear //3.5% of obs, 12.8% of panels
restore
xttab firstyear //11% of obs, 19% of panels in 2001


*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 	
* MAKE SENSE OF MISSING ZUZUGJAHR WHERE POSSIBLE!
*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 	
gen zuzugjahr = zuzugjahr_original
// the ones probably born in OW (replace with birthyear)
bys persid (year): replace zuzugjahr = jahrgang if (age[_n==1]==17 | age[_n==1]==18) & zuzugjahr_original==.  //(40 real changes made)
	bys persid (year): replace zuzugjahr = zuzugjahr[_n-1] if zuzugjahr==.  //(240 real changes made)
// widows and widowers (zivilstand==5)
xttab zivilstand if zuzugjahr==.
	tab jahrgang if zivilstand==5 & zuzugjahr==.  //all 5 127 born between 1903 and 1962
	tab firstyear if zivilstand==5 & zuzugjahr==. // 4077 (79%) have 2011 as firstyear
	bys sex: tab firstyear if zivilstand==5 & zuzugjahr==. // 2515 of them are females
				tab firstyear jahrgang if zivilstand==5 & zuzugjahr==. & sex==0 // for widowed men ok to take birthyear as move-in year if firstyear==2001
				tab firstyear jahrgang if zivilstand==5 & zuzugjahr==. & sex==1 // for widowed women ok to take birthyear as move-in year if firstyear==2001
replace zuzugjahr=jahrgang if zivilstand==5 & zuzugjahr==. & firstyear==2001
				tab firstyear if zivilstand==5 & zuzugjahr==. & sex==0 // what to do with remaining 27 widowed men?
				tab firstyear if zivilstand==5 & zuzugjahr==. & sex==1 //  what to do with remaining 1028 widowed women?
				// for methodological consistency, I replace their date with birthyear, but one could as a robustness check use firstyear as zuzugjahr
					//replace zuzugjahr=firstyear if zivilstand==5 & zuzugjahr==. & firstyear!=2001
replace zuzugjahr=jahrgang if zivilstand==5 & zuzugjahr==. //(1050 real changes made)
replace zuzugjahr=firstyear-5 if zivilstand==5 & zuzugjahr==. & jahrgang==. //1 taxpayaer, 5 obs, no jahrgang and no zuzugjahr

xttab zivilstand if zuzugjahr==.

// divorce (zivilstand==4)
	tab jahrgang if zivilstand==4 & zuzugjahr==. // now people are younger than the widows and widowers
	tab firstyear if zivilstand==4 & zuzugjahr==. // again most of them already there in 2001
	bys sex: tab firstyear if zivilstand==4 & zuzugjahr==.
				tab firstyear jahrgang if zivilstand==4 & zuzugjahr==. 
set seed 10021986
replace zuzugjahr=firstyear-floor((55-11+1) * runiform() + 11) if zivilstand==4 & zuzugjahr==. & firstyear==2001 & jahrgang< 1926
replace zuzugjahr=firstyear-floor((35-7+1) * runiform() + 7) if zivilstand==4 & zuzugjahr==. & firstyear==2001 & jahrgang >1935 & jahrgang< 1946
replace zuzugjahr=firstyear-floor((25-7+1) * runiform() + 7) if zivilstand==4 & zuzugjahr==. & firstyear==2001 & jahrgang >1945 & jahrgang< 1956
replace zuzugjahr=firstyear-floor((15-5+1) * runiform() + 5) if zivilstand==4 & zuzugjahr==. & firstyear==2001 & jahrgang >1955 & jahrgang< 1966
replace zuzugjahr=firstyear-floor((5-1+1) * runiform() + 1) if zivilstand==4 & zuzugjahr==. & firstyear==2001 & jahrgang >1965
xttab zivilstand if zuzugjahr==.

tab zuzugjahr, mi // still 6% missing
tab zuzugjahr if zivilstand==4, mi 

				tab firstyear jahrgang if zivilstand==4 & zuzugjahr==. & sex==0 // what to do with remaining 155 divorced men?
				tab firstyear jahrgang if zivilstand==4 & zuzugjahr==. & sex==1 //  what to do with remaining 160 divorced women?
				// for methodological consistency, I replace their date with birthyear, but one could as a robustness check use firstyear as zuzugjahr
				//replace zuzugjahr=firstyear if zivilstand==4 & zuzugjahr==. & firstyear!=2001
replace zuzugjahr=jahrgang if zivilstand==4 & zuzugjahr==. //(315 real changes made)
replace zuzugjahr=firstyear-5 if zivilstand==4 & zuzugjahr==. & jahrgang==. //O obs with no jahrgang and no zuzugjahr

* How many are still missing a move-in date?
replace zuzugjahr=99999 if zuzugjahr==.
xttab zuzugjahr //  missing |   18233 obs, 5.85%   ;  1953 panels 5.03%
xttab zuzugjahr if zivilstand == 0 // 2% of the single obs, 1.5% of the single panels, have a missing move-in date
xttab zuzugjahr if zivilstand == 1 // 11% of the married obs, 10.7% of the married panels, have a missing move-in date
xttab zuzugjahr if zivilstand == 2 | zivilstand == 3 // 8.5% of the separated obs, 6.4% of the separated panels, have a missing move-in date
replace zuzugjahr=. if zuzugjahr==99999 // (18233 real changes made, 18233 to missing)

* What properties do those with missing zuzugjahr have?
foreach var in stzugeh_code tarif_code aussch_code aufenthaltsbew {
tab `var' , mi
tab `var' if zuzugjahr ==., mi
}
// stzugeh_code: mostly sekundaer; aussch_code: interkantonale und internationale; tarif_code: mostly married or expenditure; aufenthaltsbew: mainly missing
tab zuzugjahr if stzugeh_code==2 & aussch_code==3, mi //32% missing obs
tab zuzugjahr if stzugeh_code==2 & aussch_code==4, mi //43% missing obs

replace zuzugjahr=99999 if zuzugjahr==.
xttab zuzugjahr if stzugeh_code==2 & aussch_code==3 // 24% of panels missing
xttab zuzugjahr if stzugeh_code==2 & aussch_code==4 // 34% of panels missing
xttab zuzugjahr if stzugeh_code==2 & aussch_code==4 | aussch_code==3 // together they make 16163 (88.6%) of the observations with missing move-in date --> because they maybe never moved in?
replace zuzugjahr=. if zuzugjahr==99999 // (18233 real changes made, 18233 to missing)

gen ausscheid_sekundaer=(stzugeh_code==2 & aussch_code==4 | aussch_code==3)
	label var ausscheid_sekundaer "Sekundaer Steuerpflichtige mit interkantonaler oder internationaler Ausscheidung"
tab zuzugjahr ausscheid_sekundaer, mi row //88.7% of observations with missing zuzugjahr probably never acutally moved to OW, but had an Ausscheidung
tab zuzugjahr ausscheid_sekundaer, mi col //26% of the observations with ausscheid_sekundaer==1 do not have a move-in date
*
tab zuzugjahr if zuzugkt==6, mi // never missing
tab firstyear if zuzugkt==6, mi // 2001 over-represented
corr zuzugjahr firstyear if zuzugkt==6 // much higher correlation than overall
corr zuzugjahr firstyear

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 

*-------------------------------------------------------------------
** CHECK WHAT'S GOING ON WITH THE MOVE-IN DATES
*-------------------------------------------------------------------

* sysmtematic comparison of move-in variable with first-appearance variable
gen testyear= firstyear-zuzugjahr // (18233  missing values generated)
	label var testyear "dummy (-1, 1) evaluating sign of [firstyear - zuzugjahr]"
	replace testyear = -1 if testyear<0 // (5587 real changes) : pple moved in after their first appearance in the data --> could be pple that moved within OW
	replace testyear = 1 if testyear>0 & testyear <. // (6760 real changes): people moved in before their first appearance in the data --> this could be women (mainly?) who divorced or became widows or people born in OW
tab testyear, mi
sum testyear 
sum testyear if testyear ==0 //  280624   obs are ok
sum testyear if testyear !=0 //   13082   obs not ok, some missing
display _N					// N=311950   but 18233 obs have no move-in date

sum testyear if testyear !=0 & zuzugkt==6 //  4016 of the strange obs: OW origin
sum testyear if testyear !=0 & zuzugkt!=6 //  9066  of strange obs: non-OW origin


gen check_zuzugjahr = testyear
	label var check_zuzugjahr "based on testyear, but progressively checked & corrected"
	note check_zuzugjahr: if 0, the variable zuzugjahr is ok and correct (despite the fact that sometimes testyear!=0)
	note check_zuzugjahr: if -1,people were observed BEFORE their move-in date
	note check_zuzugjahr: if  1,people were observed AFTER their move-in date (not possible in dataset version 7)
	note check_zuzugjahr: the variable is missing if zuzugjahr is missing; this seems reasonable: zuzugjahr is neither ok nor wrong if it is missing, the first observation may then be the second best option.
//br if check_zuzugjahr==-1 & zuzugkt!=6
//br if check_zuzugjahr==1 & zuzugkt!=6
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 


* REASONS

* 2) for those who moved within OW, one would expect check_zuzugjahr== -1
		tab zuzugkt if check_zuzugjahr== -1, mi 
	/*
		Note: indeed, for 64% of those with -1, they moved-in from OW
			tab zuzugjahr if zuzugkt==6, mi // none of the OW-movers has missing zuzugjahr
			tab zuzugkt if zuzugjahr==2001, mi //29% from OW
			xttab zuzugjahr 				
			xttab zuzugjahr if zuzugkt==6 	
			xttab zuzugjahr if zuzugkt!=6 	
	*/
  * generate a dummy for those who moved within OW 
	gen mover_OW=(check_zuzugjahr<0 & zuzugkt==6)
        label var mover_OW "dummy for those moving within OW"
	replace check_zuzugjahr=0 if check_zuzugjahr<0 & zuzugkt==6 // (4016 real changes made)
	/*
	xttab check_zuzugjahr
	tab check_zuzugjahr, mi
	*/		
	

* 3) some with testyear==-1 were only sekundaer steuerpflichtig (stzugeh_code==2) until zuzuzgdatum, but these are very few!
bys persid (year): replace check_zuzugjahr = 0 if stzugeh_code[1]==2 & testyear==-1 //(588 real changes made)
bys persid (year): replace check_zuzugjahr = 0 if check_zuzugjahr[_n-1]== 0 & testyear==-1 //(0 real changes made)
bys persid (year): gen sekundaerpfl=(stzugeh_code[1]==2 & testyear==-1)
	label var sekundaerpfl "sekundaer-Pflichtige until zuzugdatum"
tab sekundaerpfl, mi
xttab sekundaerpfl



* 4) some had been residents, moved out for a while, and then came back, i.e., year is not consecutive
	// -> find consecutive spells 
    //ssc install tsspell
			tsspell persid
			egen nspells = max(_spell), by(persid)
			by persid: egen maxspell = max(_seq)
			label var nspells "number of spells"
			label var maxspell "longest spell"
								
	// look at those with more than 1 spell
	
	/* 
	xttab nspells

	sum zuzugjahr if nspells>1, de

	xttab zuzugjahr if zuzugjahr <2001 & nspells>1
		Note: it's only a problem for a total of 61 tax units, 460 obs
	xttab zuzugjahr if nspells==2 // 1234 tax units, 8138 obs have 2 spells
	xttab zuzugjahr if nspells==3 // 44 tax units, 265 obs have 3 spells
	
	* What do these people look like in terms of income and wealth?
	sum z380s1 z480s1 if nspells==3, de
	sum z380s1 z480s1, de
		Note: those with 3 spells have below averge incomes and wealth, at least
		some of them seem to be students.
		Also those with 2 spells are somewhat lower in income and wealth, but seem
		to be more comparable to those with 1 spell.
	*/


* Now gen a firstyear-variable for each spell
sort persid year
forvalues x=1/3 {
	display _newline(3)
	display `x'
	gen firstyear`x'=year if _spell==`x' & _seq==1
	label var firstyear`x' "first appearance in data in spell `x'"
	by persid: replace firstyear`x'= firstyear`x'[_n-1] if firstyear`x'==.		
	  forvalues z=1/13 {
		 local i=`z'
		 by persid: replace firstyear`x'= firstyear`x'[_n+1] if firstyear`x'==.
	  }
	}
/*	
tab firstyear1 nspells, mi
     first |
appearance |
in data in |         number of spells
   spell 1 |         1          2          3 |     Total
-----------+---------------------------------+----------
      2001 |   221,919      6,143        218 |   228,280 
      2002 |    10,832        676         40 |    11,548 
      2003 |    10,315        501          0 |    10,816 
      2004 |     9,768        309          0 |    10,077 
      2005 |     8,812        242          7 |     9,061 
      2006 |     8,803        185          0 |     8,988 
      2007 |     7,668        132          0 |     7,800 
      2008 |     7,051         59          0 |     7,110 
      2009 |     6,104         51          0 |     6,155 
      2010 |     4,851         21          0 |     4,872 
      2011 |     3,494          0          0 |     3,494 
      2012 |     2,516          0          0 |     2,516 
      2013 |     1,222          0          0 |     1,222 
-----------+---------------------------------+----------
     Total |   303,355      8,319        265 |   311,939 



     
tab firstyear2 nspells, mi // they all have 2 or more spells, if 1 spell, firstyear2 is missing -->ok

tab firstyear3 nspells, mi // they all have 3 spells, if 1 or 2 spells, firstyear2 is missing -->ok

*/

xttab zuzugjahr if mover_OW != 1

	* BEFORE I CAN REPLACE THESE PEOPLE'S DATES, I HAVE TO MAKE SURE THEY DID NOT CHANGE CIVIL STATUS
	tab switch nspells
	tab switch nspells if sex==1 // women
	tab switch nspells if sex==0 // men
	// especially with the women the concern is, that they had several spells because they switched
	tab nspells if switch==1 & _seq==1 // 0 observations, so that seems to be ok!
	tab _seq switch // confirms that 0 switches happened in first _seq of a spell, so ok!


* generate a move-in variable taking into account several spells
gen zuzugjahre=zuzugjahr //(18233  missing values generated)
	label var zuzugjahre "all move-in dates, including those of several-movers"
	note zuzugjahre: this variable should not be used as move in date together with the information on canton of origin: this latter will only correspond to the last move-in date (probably)

replace check_zuzugjahr = 0 if nspells==2 & firstyear2==zuzugjahr & zuzugjahr!=. //(630  real changes made)
replace check_zuzugjahr = 0 if _spell==2 & firstyear2==zuzugjahr & zuzugjahr!=. //(0 real changes made) 

gen nspells2= (nspells==2 & firstyear2==zuzugjahr  & zuzugjahr!=. )
replace nspells2=1 if _spell==2 & firstyear2==zuzugjahr & zuzugjahr!=. //(0 real changes made) 
replace zuzugjahre=firstyear2 if nspells2==1 & year>=firstyear2 //(0 real changes made (some were maybe already ok?))

xttab zuzugjahre if mover_OW != 1 //still ok!
xttab zuzugjahre // and this is ok too, no excess in 2001
	
	/*
	hist zuzugjahre if nspells>1,  width(1) 
	hist zuzugjahre if nspells>1 & zuzugjahre<=2012,  width(1) 
	hist zuzugjahre if nspells>1 & zuzugjahre<=2014 & zuzugjahre>=1995,  width(1) 
	tab zuzugjahre, mi 
	*/
	
		
replace check_zuzugjahr = 0 if nspells==3 & firstyear3==zuzugjahr & zuzugjahr!=. //(26 real changes made)
replace check_zuzugjahr = 0 if _spell==3 & firstyear3==zuzugjahr & zuzugjahr!=. //(0 real changes made) 

gen nspells3= (nspells==3 & firstyear3==zuzugjahr  & zuzugjahr!=.)
replace zuzugjahre=firstyear3 if nspells3==1 & year>=firstyear3 //(0 real changes made)
replace zuzugjahre=firstyear2 if nspells==3 & year>=firstyear2 & year<firstyear3 //(73 real changes made)

xttab zuzugjahre if mover_OW != 1 //still ok!
xttab zuzugjahre // and this is ok too, no excess in 2001




/*
xttab check_zuzugjahr

                  Overall             Between            Within
check_z~r |    Freq.  Percent      Freq.  Percent        Percent
----------+-----------------------------------------------------
       -1 |    1059      0.36      1059      2.87          22.31 // observed before move-in date
        0 |  285884     97.34     35971     97.55          99.48
        1 |    6763      2.30       857      2.32         100.00 // observed after move-in date
----------+-----------------------------------------------------
    Total |  293706    100.00     37887    102.74          97.33
                             (n = 36876)

*/





* 8) Ausscheidungen = tax allocation across cantons &/or countries

* Where do these people actually live if they are subject to international tax allocation?
* Genereate wohnort_OW
log using "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/log_setup/wohnorte_tablle", replace name(wohnorte_tablle)
tab wohnort
log close wohnorte_tablle

gen wohnort_OW=0
local list1 = "Alpnachstad Engelberg Giswil Kerns Lungern Sachseln Sarnen  Ramersberg Stalden Wilen K�giswil"
local s= wordcount("`list1'")
display `s'
forv c = 1/`s' {
  local village= word("`list1'",`c')
  display "The town `village' is substringed now"
  
local n=length("`village'")
  forv i = 2/`n' {
    local name`i' = substr("`village'",1,`i')
    dis "`i', `name`i''"
    replace wohnort_OW=1 if wohnort=="`name`i''" & wohnort!=""
  }
}

* deal with Alpnach Dorf
local village = "Alpnach Dorf"
local n=length("`village'")
disp `n'
  forv i = 9/`n' {
    local name`i' = substr("`village'",1,`i')
    dis "`i', `name`i''"
    replace wohnort_OW=1 if wohnort=="`name`i''" & wohnort!=""
  }

* deal with Wilen (Sarnen) 
local village = "Wilen (Sarnen)"
local n=length("`village'")
disp `n'
  forv i = 7/`n' {
    local name`i' = substr("`village'",1,`i')
    dis "`i', `name`i''"
    replace wohnort_OW=1 if wohnort=="`name`i''" & wohnort!=""
  }

* deal with Stalden (Sarnen)
local village = "Stalden (Sarnen)"
local n=length("`village'")
disp `n'
  forv i = 9/`n' {
    local name`i' = substr("`village'",1,`i')
    dis "`i', `name`i''"
    replace wohnort_OW=1 if wohnort=="`name`i''" & wohnort!=""
  }

gen wohnort_OW_oneletter=0
label var wohnort_OW_oneletter `"considered as OW resident, even though wohnort had 1 letter only (A E G K L S R W)"'

local l = "A E G K L S R W"
forva c = 1/9 {
  local village = word("`l'", `c')
  replace wohnort_OW_oneletter=1 if wohnort=="`village'" & wohnort!=""
}

gen wohnort_OW1=wohnort_OW + wohnort_OW_oneletter if wohnort_OW_oneletter!=. //1047
note wohnort_OW1: There are observations with only 1 letter in the village name, these are considered as OW too (==1), if they start with a OW village letter
label var wohnort_OW1 "OW resident, based on village names and abbreviations (including 1 letter only)"
label var wohnort_OW "OW resident, based on village names and abbreviations (min of 2 letters)"

*-------------------------------------------------------------------
* FINAL CHECK OF MISSING ZUZUGJAHRE
*-------------------------------------------------------------------
tab wohnort_OW if aussch_code>0 //  21,028 had Ausscheidung and were living in OW
tab wohnort_OW if ausscheid_sekundaer>0 //12,104 had Ausscheidung, were sekundaer steuerpflichtig and lived in OW
tab wohnort_OW if zuzugjahr==. & aussch_code>0 // 90 had Ausscheidung and were living in OW
tab wohnort_OW if zuzugjahr==. & ausscheid_sekundaer>0 // 87  had Ausscheidung, were sekundaer steuerpflichtig and lived in OW
tab wohnort_OW if zuzugjahr!=. & aussch_code>0 // 20,938  had Ausscheidung and were living in OW

tab zuzugjahre , mi // (311,939   obs), 18,233  missing
tab zuzugjahre if wohnort_OW==1 , mi // looks good (206,184 obs) 383 missing
tab zuzugjahre if zuzugkt!=6, mi //  (162,398 obs), 18,233  missing (11.23%)
tab zuzugjahre if wohnort_OW==1 & zuzugkt!=6, mi // looks good (84,049  obs)
tab zuzugjahre if wohnort_OW==1 & zuzugkt!=6 & aussch_code==0, mi // looks very similar, but only 72,102  obs 

tab check_zuzugjahr wohnort_OW
xttab wohnort_OW //66% of obs, 59% of panels live in OW

twoway (kdensity zuzugjahr) || (kdensity zuzugjahre), xlabel(1900(10)2020, grid) legend(label(1 "kdensity zuzugjahr") label(2 "kdensity zuzugjahre") rows(2)) name(gr_1)
// zuzugjahre and zuzugjahr are essentially the same

set scheme mine
twoway (kdensity zuzugjahr) || (kdensity zuzugjahr if wohnort_OW==1) || (kdensity zuzugjahr if wohnort_OW==1 & zuzugkt!=6), xlabel(1900(10)2020, grid) xline(2006) ///
legend(label(1 "Full sample") label(2 "Living in OW") label(3 "Living in OW and coming from outside") rows(3)) name(gr_4) ///
xtitle("Moving year") ytitle("Kernel density")

replace zuzugjahr=1885 if zuzugjahr==.
twoway (kdensity zuzugjahr) || (kdensity zuzugjahr if wohnort_OW==1) || (kdensity zuzugjahr if wohnort_OW==1 & zuzugkt!=6), xlabel(1900(10)2020, grid) xline(2006) ///
legend(label(1 "Full sample") label(2 "Living in OW") label(3 "Living in OW and coming from outside") rows(3)) name(gr_5, replace) ///
xtitle("Recorded moving year") ytitle("Kernel density") xlabel(1885 "missing" 1900 "1900" 1910 "1910" 1920 "1920" 1930 "1930" 1940 "1940" 1950 "1950" 1960 "1960" 1970 "1970" 1980 "1980" 1990 "1990" 2000 "2000" 2010 "2010" 2020 "2020")
graph export "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/log_setup/density_zuzug_variable.pdf", as(pdf) replace
replace zuzugjahr=. if zuzugjahr==1885


*-------------------------------------------------------------------
* GENERATE MOVE-OUT DATE using _end from tsspell
*-------------------------------------------------------------------
* for those with several spells, the endyear should, in a first step be, the LAST endyear
gen wegzugjahre=.
gen wegzugjahr_uncorr=.
gen wegzugjahr=.

forvalues x=1/3 {
 gen endyear`x' = year if _end==1 & _spell==`x' // & year!=2013
 label var endyear`x' "last year of appearance in data in spell `x'"
 by persid: replace endyear`x'= endyear`x'[_n-1] if endyear`x'==.
 
	forvalues z=1/13 {
	 local i=`z'
	 by persid: replace endyear`x' = endyear`x'[_n+1] if endyear`x' ==.
	}
 
  	replace wegzugjahre=endyear`x'+1 if _spell==`x' & endyear`x'!=.  
  	replace wegzugjahr_uncorr=endyear`x'+1 if endyear`x'!=. 
 }

foreach var in wegzugjahre wegzugjahr wegzugjahr_uncorr{
 replace `var' =0  if  `var'==2014
}

label var wegzugjahr_uncorr "last move-out date (last year in data+1)"
note wegzugjahr_uncorr: based on last appearance in the data + 1 (since move-out happened in the year AFTER last apperance). 0 means that by 2013, the taxpayer was still registered in OW.
note wegzugjahr_uncorr: People might disappear without moving-out: marriage or death.
label var wegzugjahre "all move-out dates, including those of several-movers"
note wegzugjahre: based on last appearance in the data of each spell + 1 year (since move-out happened in the year AFTER last apperance). 0 means that by 2013, the taxpayer was still registered in OW.
note wegzugjahre: People might disappear without moving-out: marriage or death.



/*
 tab wegzugjahr_uncorr  
	  
       last |
   move-out |
 date (last |
    year in |
    data+1) |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |    247,159       79.25       79.25
       2002 |      1,010        0.32       79.57
       2003 |      2,130        0.68       80.25
       2004 |      3,264        1.05       81.30
       2005 |      3,624        1.16       82.46
       2006 |      4,366        1.40       83.86
       2007 |      4,695        1.51       85.37
       2008 |      6,106        1.96       87.32
       2009 |      7,018        2.25       89.57
       2010 |      7,508        2.41       91.98
       2011 |      7,908        2.54       94.52
       2012 |      8,708        2.79       97.31
       2013 |      8,392        2.69      100.00
------------+-----------------------------------
      Total |    311,888      100.00


*/


note  wegzugnachbfs: diese Variable ist eine Mischung aus BFS-Gemeindenummern und Ländercodes
note  zuzugvonbfs: diese Variable ist eine Mischung aus BFS-Gemeindenummern und Ländercodes


***** Take into account that some people disappear becuase they die
/* if in the year the person moved out (-1), 
the person is older than 70,
and wegzugnachbfs==.
 consider them as deceased.
*/

* gen an indicator for those whom I assume they died
gen dead=.
replace dead=1 if wegzugjahr_uncorr-1==year & age>70 & unterj2==1 & wegzugnachbfs==. //(2361 real changes made)
replace dead=0 if _spell==1 & nspells>1 & dead==1
replace dead=0 if _spell==2 & nspells>2 & dead==1
label var dead "Dummy indicating that someone might have died in that year"

forvalues x=1/13 {
 local i=`x'
 bys persid: replace dead= dead[_n+1] if dead==.
}
replace dead=0 if dead==.

replace wegzugjahr=wegzugjahr_uncorr
replace wegzugjahr=99 if dead==1
replace wegzugjahre=99 if dead==1 

label var wegzugjahr "move-out year (last year in data+1), accounting for deaths"
note wegzugjahr: For everyone below 71, this is the last year of observation (for each spell). For pple older than 70, I assume they died when they disappear during the year (unterj2==1), indicated by value 99. value 0: did not move out.


replace wegzugjahr=. if zuzugjahr==.
replace wegzugjahre=. if zuzugjahre==.

*-------------------------------------------------------------------
* CALCULATE TAXES AND TAX LOAD 
*-------------------------------------------------------------------
//  merge the federal taxable income
//merge m:1 year persid using "/Volumes/OW_data/dbst_nodup.dta", gen(merge_dbst)
merge m:1 year persid using "/Volumes/OW_data/dbst_nodup.dta", gen(merge_dbst)
	label var merge_dbst "Federal taxable income merged to OW data"
// calculate tax rates
run "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/taxrates/calc_taxrates.do"
run "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/taxrates/calc_taxrates_hypothetical_h1.do"
run "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/taxrates/calc_taxrates_hypothetical_h2-new.do"
run "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/taxrates/calc_mtr.do"
run "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/taxrates/calc_mtr_dbst.do"
gen mtr_total = mtr_multpl + mtr_dbs
	label var mtr_total "Marginal tax rate including cantonal, municipal and federal income taxes"
	
*-------------------------------------------------------------------
* GENERATE FURTHER INDICATOR VARIABLES
*-------------------------------------------------------------------
run  "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/more/indicator-variables.do"

cap drop _merge


*-------------------------------------------------------------------
* ADD THE DISTANCE TO THE FORMER MUNICIPALITY OF RESIDENCE
*-------------------------------------------------------------------
run "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/distance/match_distance.do"
gen dist_km=dist_m/1000
    label var dist_km "driving distance in km"
gen dist_min=dist_s/60
    label var dist_min "drivig time in m"

*-------------------------------------------------------------------
* ESTIMATE THE TAX RATE IN FORMER MUNICIPALITY OF RESIDENCE
*-------------------------------------------------------------------
* generate a gross income variable for everyone 
codebook z199s1, mv
    //Problem is that z199s1 is missing for 42,458 observations
    
    // for those where gross income z199s1 is not missing, estimate how much it differs from taxable income
    gen ratio_grosstaxable = z199s1/esteuerbst
        label var ratio_grosstaxable "ratio gross income / taxable income (z199s1/esteuerbst)"
    sum ratio_grosstaxable, de // top 1% >263
    sum esteuerbst, de  //50% have a taxable income < 25K

    binscatter ratio_grosstaxable esteuerbst , name(bs_1, replace) 
    binscatter ratio_grosstaxable esteuerbst  if esteuerbst >20000 &  ratio_grosstaxable<263 & z199s1>0, linetype(connect) nq(100)  name(bs_2, replace)
    binscatter ratio_grosstaxable esteuerbst  if esteuerbst >20000 &  ratio_grosstaxable<263 & z199s1>0, linetype(connect) nq(50)   name(bs_3, replace) 
    binscatter ratio_grosstaxable esteuerbst  if esteuerbst >25000 &  ratio_grosstaxable<263 & z199s1>0, linetype(connect) nq(100)  name(bs_4, replace)
    binscatter ratio_grosstaxable esteuerbst  if esteuerbst >70000 &  ratio_grosstaxable<263 & z199s1>0, linetype(connect) nq(100)  name(bs_5, replace)
    binscatter ratio_grosstaxable esteuerbst  if esteuerbst >93000 &  ratio_grosstaxable<263 & z199s1>0, linetype(connect) nq(50)   name(bs_6, replace)
    binscatter ratio_grosstaxable esteuerbst if  ratio_grosstaxable<10 & z199s1>0, name(bs_7, replace) nq(50) genxq(taxinc_vint_nomiss)

    // calculate percentile of distribution of taxable income
    xtile taxinc_vint= esteuerbst , nq(50) altdef
        label var taxinc_vint "percentile taxable income (nq=50, esteuerbst >20000, all years pooled)"

    // replace ratio by percentile average where ratio is missing
    sum ratio_grosstaxable, de // top 1% >263, tp 25% >2.4
    sum ratio_grosstaxable if ratio_grosstaxable < 1 // only 27 obs <1

    gen ratio_grosstaxable_nm = ratio_grosstaxable if ratio_grosstaxable < 3 & ratio_grosstaxable>=1

    sum ratio_grosstaxable if taxinc_vint <11 & ratio_grosstaxable < 3 & ratio_grosstaxable>=1
    replace ratio_grosstaxable_nm= `r(mean)' if taxinc_vint <11 & ratio_grosstaxable_nm==.

    forval l=11/50 {
    sum ratio_grosstaxable if taxinc_vint== `l' & ratio_grosstaxable < 3 & ratio_grosstaxable>=1
    replace ratio_grosstaxable_nm= `r(mean)' if taxinc_vint==`l' & ratio_grosstaxable_nm==.
    }

    sum ratio_grosstaxable_nm, de // top 1% >263, tp 25% >2.4
        label var ratio_grosstaxable_nm  "ratio gross income / taxable income, missings imputed"


// generate variable to merge tax data
gen y_gross = ratio_grosstaxable_nm * esteuerbst
    label var y_gross "Gross taxable income measure to impute tax in former canton of residence"

gen y_tax = floor(y_gross/1000)
    label var y_tax "gross income to merge with tax load"

gen l_dummy	= ( married==0 & (snpanzki==0 | snpanzki==.))
    label var l_dummy "dummy: single taxpayer"
gen v2k_dummy =  ( married==1 & snpanzki>0 & snpanzki<.)
    label var v2k_dummy "dummy: married w/ childeren taxpayer"
gen v0k_dummy = ( married==1 & (snpanzki==0 | snpanzki==.))
    label var v0k_dummy "dummy: married w/o chilrdren taxpayer"
// some people were not assigned to any group
tab snpanzki married if (l_dummy ==0 & v2k_dummy==0 & v0k_dummy==0)
    //->the single parents! I'll tax them like married with children
gen lk_dummy = ( married==0 & snpanzki>0 & snpanzki<.)
tab snpanzki married if (l_dummy ==0 & v2k_dummy==0 & v0k_dummy==0 & lk_dummy==0)
//no observations

drop if year>2010
replace y_tax=1000 if y_tax>1000 // assume constant average tax above 1 mio CHF, as no tax rate data are available beyond this income

// account for municipality merges and bring municpality of origin on same level as in taxload data (Nov. 2010)
rename zuzugvonbfs gdenummer
sort gdenummer
cap drop _merge
forvalues i=1(1)4{
merge m:1 gdenummer using "/Users/isabel/Dropbox/swisstaxholiday/data-documentation/municipality_merges/mutations.dta", gen(_merge)
qui: replace gdenummer = gdenummer_new if gdenummer_new != .
drop if _merge==2 
drop gdenummer_new _merge
sort gdenummer
}
rename gdenummer zuzugvonbfs

// merge the tax rate data
merge m:1 year y_tax zuzugvonbfs using "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/tax_at_origin/taxloaddata.dta", ///
            keepusing(l_ipol v2k_ipol v0k_ipol) keep(master match)
    
            tab year if _merge==1 //60K observations not matched
            tab year if _merge==1 & zuzugvonbfs ==. //23K observations where former municipality of residence is unknown
            tab year if _merge==1 & zuzugvonbfs >8200 &  zuzugvonbfs<. //36K observations where former municipality of residence is abroad
                        // 59'949 observations where former municipality is either unknonwn or abroad
            tab year if _merge==1 & zuzugkt==6 //38 observations where former municipality of residence is in OW

            br if _merge==1 & zuzugvonbfs<8201 //291 obs not merged even though some former municipality is known, but municipality id is strange...

log using "/Users/isabel/Dropbox/TaxMobility_OW/data/taxdata_OW/setup_data/log_setup/zuzugvonbfs_notmatched2", replace name(zuzugvonbfs_notmatched2)
tab zuzugvonbfs if _merge==1 & zuzugvonbfs<8201 & y_tax<=1000
tab zuzugvonbfs if _merge==1 & zuzugvonbfs<8201 & y_tax<=1000, nolab
log close zuzugvonbfs_notmatched2

gen form_avg_tax =.
    label var form_avg_tax "Est. avg. cantonal+municipality+church tax rate in former municipality of residence"
replace form_avg_tax = l_ipol if l_dummy ==1
replace form_avg_tax = v2k_ipol if v2k_dummy ==1
replace form_avg_tax = v2k_ipol if lk_dummy ==1
replace form_avg_tax = v0k_ipol if v0k_dummy ==1


br if _merge==3 & form_avg_tax==. //988 obs where tax rate was merged but former tax rate is missing
br if _merge==3 & form_avg_tax==. & (l_dummy !=0 | v2k_dummy!=0 | v0k_dummy!=0 | lk_dummy!=0) //988 obs where the dummy is correct, but the tax rate is missing (why?)

note form_avg_tax: "59'949 observations where former municipality is either unknonwn or abroad"
note form_avg_tax: "988 obs where the hh-type dummy is correct, but the tax rate was missing (not clear why it was missing)"
note form_avg_tax: "Single parents are assumed to be taxed like married with children"
note form_avg_tax: "Single parents and married with children have the same tax rate, irrespective of the exact number of children"

// tax savings
gen taxsavings = (form_avg_tax - avgt_stekKGKi_taxable*100)*esteuerbst/1000
    label var taxsavings "Tax savings from moving (in 1000 CHF)"

sum taxsavings, de
sum taxsavings if year>2005, de
sum taxsavings if year>2005 & year<2008 , de
sum taxsavings if year>2005 & year<2008 & zuzugjahr>2005, de
sum taxsavings if year>2005 & year<2008 & zuzugjahr>2005 & esteuerbst>= 300000 & esteuerbst <., de
sum taxsavings if year>2005  & zuzugjahr>2005 & esteuerbst>= 300000 & esteuerbst <., de

bys persid: egen meantaxsavings = mean(taxsavings)
    label var meantaxsavings "Average annual tax savings from moving"
sum meantaxsavings, de
sum meantaxsavings if year>2005, de
sum meantaxsavings if year>2005 & year<2008, de
sum meantaxsavings if year>2005 & year<2008 & zuzugjahr>2005, de
sum meantaxsavings if year>2005 & year<2008 & zuzugjahr>2005 & esteuerbst>= 300000 & esteuerbst <., de
sum meantaxsavings if year>2005  & zuzugjahr>2005 & esteuerbst>= 300000 & esteuerbst <., de




* * * * * FINISH * * * * *

cap drop _merge

compress
save "/Volumes/OW_data/my_dta_v8.1.dta", replace

log close

* * * END OF DO-FILE * * * * * * END OF DO-FILE * * * * * * END OF DO-FILE * * * 
